#script to match CEMS with NEI

#CEMS and NEI can be matched by these things: ORISPL, ORISUN, Fuel, distance (LAT, LON)
#Match quality is ranked by match_qual_flag (0 = strictest criteria, 5 = loosest)
#match_qual_flag = 0: matched by  ORISPL, ORISUN, Fuel and passed distance sanity check
#match_qual_flag = 1: matched by  ORISPL, ORISUN,      and passed distance sanity check
#match_qual_flag = 2: matched by  ORISPL,               Fuel and passed distance sanity check
#match_qual_flag = 3: matched by  ORISPL,                    and passed distance sanity check
#match_qual_flag = 4: matched by                        Fuel and        distance (mutually closest)
#match_qual_flag = 5: matched by                                        distance (mutually closest)
#not matched: matched by nothing, artificial stack info assigned

#############################################################################################################################
#install libraries
#install.packages("plyr")
library(plyr)
#install.packages("dplyr")
library(dplyr)
#install.packages("readr")
library(readr)
#install.packages("psych")
library(psych)
#install.packages("ggplot2")
library(ggplot2)
#install.packages("ggpubr")
library(ggpubr)
#install.packages("geosphere")
library(geosphere)

#############################################################################################################################
#set working directory
#NOTE(review): setwd() with an absolute path ties the script to one machine; every
#relative write.csv() below lands in this directory. Kept so output locations do not move.
setwd("C:/Users/clyu/Desktop/GHG_CO2/Improving_inventory/V7_GRA2PES2021/Match_CEMS_NEI_EGU/spreadsheets/output_202108")

#############################################################################################################################
#read prepared .csv files
CEMS <- read.csv("C:/Users/clyu/Desktop/GHG_CO2/Improving_inventory/V7_GRA2PES2021/Match_CEMS_NEI_EGU/spreadsheets/input_2021/CEMS_202108_scaled_to_annual_tot.csv")
NEI <- read.csv("C:/Users/clyu/Desktop/GHG_CO2/Improving_inventory/V7_GRA2PES2021/Match_CEMS_NEI_EGU/spreadsheets/input_2021/NEI17_EGU_Info_v2.csv")

#remove NA rows at the end
#is.na() tests for missing values directly instead of comparing against the string 'NA'
CEMS <- CEMS %>% filter(!is.na(ORISPL))

#############################################################################################################################
#rename columns to make it less confusing for later coding
#old name -> new name; CEMS coordinates get a _CEMS suffix, NEI coordinates and
#pollutant columns get a _NEI suffix, and the NEI key columns take the CEMS key names
cems_new_names <- c(LON = "LON_CEMS", LAT = "LAT_CEMS")
for (old_name in names(cems_new_names)) {
  colnames(CEMS)[colnames(CEMS) == old_name] <- cems_new_names[[old_name]]
}

nei_new_names <- c(
  EGU_SCC_Group = "Fuel",
  LONGITUDE = "LON_NEI",
  LATITUDE = "LAT_NEI",
  ORIS_FACILITY_CODE = "ORISPL",
  ORIS_BOILER_ID = "ORISUN",
  CO = "CO_NEI",
  CO2 = "CO2_NEI",
  NH3 = "NH3_NEI",
  NOX = "NOX_NEI",
  PM10.PRI = "PM10.PRI_NEI",
  PM25.PRI = "PM25.PRI_NEI",
  SO2 = "SO2_NEI",
  VOC = "VOC_NEI"
)
#applied one at a time, in the listed order; a name that is absent is simply skipped
for (old_name in names(nei_new_names)) {
  colnames(NEI)[colnames(NEI) == old_name] <- nei_new_names[[old_name]]
}

#existing columns cannot be unique identifier
#assign a unique identifier to NEI rows: the 1-based row number of the NEI table as read
#(i.e. the row order of the NEI_EGU_info_v2 file Colin provided), so IDs are easy to relate back
#seq_len() also behaves on an empty table, where 1:nrow(NEI) would iterate over c(1, 0)
NEI$NEI_EGU_ID <- seq_len(nrow(NEI))

#output NEI with unique EGU identifier for record
write.csv(NEI, file = "NEI_EGU_ID.csv")
#############################################################################################################################
#prepare data

##########################
#rename fuel types in CEMS$Fuel to the same format as in NEI$Fuel
#each NEI-style group lists the CEMS fuel labels that are folded into it;
#labels that appear in no group are left untouched
fuel_groups <- list(
  EGU_NG = c("Pipeline Natural Gas", "Natural Gas", "Other Gas",
             "Coal, Natural Gas", "Process Gas", "Diesel Oil, Pipeline Natural Gas",
             "Other Gas, Pipeline Natural Gas", "Natural Gas, Pipeline Natural Gas",
             "Coal, Pipeline Natural Gas"),
  EGU_Coal = c("Coal", "Petroleum Coke", "Other Solid Fuel", "Coal Refuse"),
  EGU_Oil = c("Other Oil", "Diesel Oil", "Tire Derived Fuel", "Residual Oil"),
  EGU_BIO = c("Wood")
)

#look the groups up against the labels as reported, so one recode can never feed another;
#%in% is FALSE for NA, so a missing fuel is left as NA instead of stopping the script
reported_fuel <- CEMS$Fuel
for (fuel_group in names(fuel_groups)) {
  CEMS$Fuel[reported_fuel %in% fuel_groups[[fuel_group]]] <- fuel_group
}

#remove EGU_BIO from CEMS and NEI data set, because Biomass is not fossil fuel
#NOTE(review): filter() also drops rows whose Fuel is NA, because the comparison is NA there
CEMS <- CEMS %>% filter(Fuel != "EGU_BIO")
NEI <- NEI %>% filter(Fuel != "EGU_BIO")

#CEMS annual totals after biomass removal; denominators of the unassigned percentages below
CEMS_NOx_noBIO_tot <- sum(CEMS$Annual_NOx_Emis_MetricTon)
CEMS_SO2_noBIO_tot <- sum(CEMS$Annual_SO2_Emis_MetricTon)
CEMS_CO2_noBIO_tot <- sum(CEMS$Annual_CO2_Emis_MetricTon)

##########################
#STKDIAM, STKFLOW, STKVEL can be calculated knowing two of the three
#Q = V*A = V*pi*(d/2)^2
#each fill only touches rows where exactly the target value is missing and the other two are present

#Calculate STKDIAM if it is NA by d = r*2 = 2*sqrt((Q/V)/pi)
fill_diam <- is.na(NEI$STKDIAM) & !is.na(NEI$STKFLOW) & !is.na(NEI$STKVEL)
NEI$STKDIAM[fill_diam] <- sqrt((NEI$STKFLOW[fill_diam] / NEI$STKVEL[fill_diam]) / pi) * 2

#Calculate STKFLOW if it is NA by Q(m3/s)= V(m/s)*A(m2) = STKVEL*pi*(STKDIAM/2)^2
fill_flow <- !is.na(NEI$STKDIAM) & is.na(NEI$STKFLOW) & !is.na(NEI$STKVEL)
NEI$STKFLOW[fill_flow] <- NEI$STKVEL[fill_flow] * pi * (NEI$STKDIAM[fill_flow] / 2)^2

#Calculate STKVEL if it is NA by V=Q/A=Q/(pi*(d/2)^2)
fill_vel <- !is.na(NEI$STKDIAM) & !is.na(NEI$STKFLOW) & is.na(NEI$STKVEL)
NEI$STKVEL[fill_vel] <- NEI$STKFLOW[fill_vel] / (pi * (NEI$STKDIAM[fill_vel] / 2)^2)

##########################
#remove rows from NEI if there is no stack info
#NOTE(review): the comparison against the string 'NA' is kept on purpose; it drops NA rows
#(the comparison is NA there) but keeps NaN values, which is.na() would also remove.
#Confirm the intended treatment of NaN before switching to is.na().
NEI_wsi <- NEI %>%
  filter(STKHGT != 'NA' & STKDIAM != 'NA' & STKTEMP != 'NA' & STKVEL != 'NA' & STKFLOW != 'NA') #13139

#############################################################################################################################
#CEMS and NEI can be matched by these things: ORISPL, ORISUN, Fuel, distance (LAT, LON)
#Match quality is ranked by match_qual_flag (0 = strictest criteria, 5 = loosest)
#match_qual_flag = 0: matched by  ORISPL, ORISUN, Fuel and passed distance sanity check
#match_qual_flag = 1: matched by  ORISPL, ORISUN,      and passed distance sanity check
#match_qual_flag = 2: matched by  ORISPL,               Fuel and passed distance sanity check
#match_qual_flag = 3: matched by  ORISPL,                    and passed distance sanity check
#match_qual_flag = 4: matched by                        Fuel and        distance
#match_qual_flag = 5: matched by                                        distance
#not matched: matched by nothing, artificial stack info assigned

#############################################################################################################################
#match_qual_flag = 0: matched by  ORISPL, ORISUN, Fuel and passed distance sanity check

##########################
#prepare NEI_wsi_match0_pool array
#a flag-0 candidate must carry all three keys (ORISPL, ORISUN, Fuel)
NEI_wsi_match0_pool <- NEI_wsi %>%
  filter(!is.na(ORISPL) & !is.na(ORISUN) & !is.na(Fuel)) #4555 #if a NEI point has ORISPL it also has ORISUN, every row of NEI has fuel

#CEMS columns carried into the per-flag output tables
cems_cols <- c("ORIS_ID", "ORISPL", "ORISUN", "LON_CEMS", "LAT_CEMS", "Fuel",
               "Annual_Operating_Time_hr", "Annual_Heat_Input_MMBtu",
               "Annual_NOx_Emis_MetricTon", "Annual_SO2_Emis_MetricTon",
               "Annual_CO2_Emis_MetricTon")

#matching each row of CEMS with NEI_wsi_match0_pool by  ORISPL, ORISUN, Fuel
#for every CEMS row collect the pool row numbers whose keys all agree (in pool order);
#which() ignores NA comparisons, so a missing key means "no match" instead of an error
mflg0_hits <- lapply(seq_len(nrow(CEMS)), function(rr_cems) {
  which(NEI_wsi_match0_pool[["ORISPL"]] == CEMS[["ORISPL"]][rr_cems] &
          NEI_wsi_match0_pool[["ORISUN"]] == CEMS[["ORISUN"]][rr_cems] &
          NEI_wsi_match0_pool[["Fuel"]] == CEMS[["Fuel"]][rr_cems])
})
mflg0_n_hits <- lengths(mflg0_hits)
#largest number of candidates found for a single CEMS row (0 when nothing matched)
max_mflg0_NEI_EGU_ID <- max(0, mflg0_n_hits)

#write down all the NEI NEI_EGU_ID that meet the matching criteria
#(columns NEI_EGU_ID_1, NEI_EGU_ID_2, ... on CEMS; NA where a row has fewer candidates)
for (k in seq_len(max_mflg0_NEI_EGU_ID)) {
  kth_pool_row <- vapply(mflg0_hits, function(h) h[k], integer(1))
  CEMS[[sprintf("NEI_EGU_ID_%d", k)]] <- NEI_wsi_match0_pool[["NEI_EGU_ID"]][kth_pool_row]
}

##########################
#Distance Sanity Check (DSC)

#Get a subset of CEMS that are match flag 0 pre-DSC
CEMS_mflg0_preDSC <- CEMS[mflg0_n_hits > 0, , drop = FALSE]
rownames(CEMS_mflg0_preDSC) <- NULL

#reorganize the table to have each NEI_EGU_ID in a private row:
#one row per (CEMS row, candidate) pair, stacked as all first candidates,
#then all second candidates, and so on
cems_row <- rep(seq_along(mflg0_hits), mflg0_n_hits)
hit_rank <- sequence(mflg0_n_hits)
pool_row <- as.integer(unlist(mflg0_hits))
stack_order <- order(hit_rank, cems_row)
cems_row <- cems_row[stack_order]
pool_row <- pool_row[stack_order]

CEMS_mflg0_preDSC_flat <- CEMS[cems_row, cems_cols, drop = FALSE]
rownames(CEMS_mflg0_preDSC_flat) <- NULL
CEMS_mflg0_preDSC_flat$NEI_EGU_ID <- NEI_wsi_match0_pool[["NEI_EGU_ID"]][pool_row]

#great-circle distance between each CEMS unit and its NEI candidate, distance unit: km
CEMS_mflg0_DSC_flat <- CEMS_mflg0_preDSC_flat
if (nrow(CEMS_mflg0_DSC_flat) > 0) {
  CEMS_mflg0_DSC_flat$distance <- distHaversine(
    cbind(NEI_wsi_match0_pool[["LON_NEI"]][pool_row], NEI_wsi_match0_pool[["LAT_NEI"]][pool_row]),
    cbind(CEMS_mflg0_DSC_flat[["LON_CEMS"]], CEMS_mflg0_DSC_flat[["LAT_CEMS"]])
  ) / 1000
} else {
  CEMS_mflg0_DSC_flat$distance <- numeric(0)
}

distance_buffer_bygrid <- sqrt((4 / 2)^2 + (4 / 2)^2) #inspired by 4km grid

#if matched points are too far away (distance > distance_buffer_bygrid), remove them from CEMS_mflg0_DSC_flat
#(pairs whose distance is NA are dropped as well)
CEMS_mflg0_flat <- CEMS_mflg0_DSC_flat %>% filter(distance <= distance_buffer_bygrid)

#Update distance_buffer: the largest distance accepted at flag 0.
#With no accepted pair there is nothing to tighten to, so the grid buffer is used
#instead of the -Inf that max() returns for an empty vector.
distance_buffer <- if (nrow(CEMS_mflg0_flat) > 0) {
  max(CEMS_mflg0_flat$distance)
} else {
  distance_buffer_bygrid
}

# Get a subset of CEMS that are not matched after flag 0
CEMS_mflg0_leftover <- CEMS[!(CEMS[["ORIS_ID"]] %in% CEMS_mflg0_flat[["ORIS_ID"]]), cems_cols, drop = FALSE]
rownames(CEMS_mflg0_leftover) <- NULL

write.csv(CEMS_mflg0_flat, file = "CEMS_mflg0_flat.csv")
write.csv(CEMS_mflg0_leftover, file = "CEMS_mflg0_leftover.csv")

#emissions still unassigned after flag 0, absolute and as a share of the no-biomass CEMS totals
unassigned_CEMS_NOx_mflg0 <- sum(CEMS_mflg0_leftover$Annual_NOx_Emis_MetricTon)
unassigned_CEMS_SO2_mflg0 <- sum(CEMS_mflg0_leftover$Annual_SO2_Emis_MetricTon)
unassigned_CEMS_CO2_mflg0 <- sum(CEMS_mflg0_leftover$Annual_CO2_Emis_MetricTon)

unassigned_NOx_percentage_mflg0 <- unassigned_CEMS_NOx_mflg0 / CEMS_NOx_noBIO_tot * 100 #8.47%
unassigned_SO2_percentage_mflg0 <- unassigned_CEMS_SO2_mflg0 / CEMS_SO2_noBIO_tot * 100 #2.80%
unassigned_CO2_percentage_mflg0 <- unassigned_CEMS_CO2_mflg0 / CEMS_CO2_noBIO_tot * 100 #9.79%

#############################################################################################################################
#match_qual_flag = 1: matched by  ORISPL, ORISUN,      and passed distance sanity check

##########################
#prepare NEI_wsi_match1_pool array
NEI_wsi_match1_pool <- NEI_wsi_match0_pool

#CEMS columns carried into the per-flag output tables
cems_cols <- c("ORIS_ID", "ORISPL", "ORISUN", "LON_CEMS", "LAT_CEMS", "Fuel",
               "Annual_Operating_Time_hr", "Annual_Heat_Input_MMBtu",
               "Annual_NOx_Emis_MetricTon", "Annual_SO2_Emis_MetricTon",
               "Annual_CO2_Emis_MetricTon")

#matching each row of CEMS_mflg0_leftover with NEI_wsi_match1_pool by  ORISPL, ORISUN
#for every leftover row collect the pool row numbers whose keys agree (in pool order);
#which() ignores NA comparisons, so a missing key means "no match" instead of an error
mflg1_hits <- lapply(seq_len(nrow(CEMS_mflg0_leftover)), function(rr_cems) {
  which(NEI_wsi_match1_pool[["ORISPL"]] == CEMS_mflg0_leftover[["ORISPL"]][rr_cems] &
          NEI_wsi_match1_pool[["ORISUN"]] == CEMS_mflg0_leftover[["ORISUN"]][rr_cems])
})
mflg1_n_hits <- lengths(mflg1_hits)
#largest number of candidates found for a single CEMS row (0 when nothing matched)
max_mflg1_NEI_EGU_ID <- max(0, mflg1_n_hits)

#write down all the NEI NEI_EGU_ID that meet the matching criteria
#(columns NEI_EGU_ID_1, NEI_EGU_ID_2, ... on CEMS_mflg0_leftover; NA where a row has fewer candidates)
for (k in seq_len(max_mflg1_NEI_EGU_ID)) {
  kth_pool_row <- vapply(mflg1_hits, function(h) h[k], integer(1))
  CEMS_mflg0_leftover[[sprintf("NEI_EGU_ID_%d", k)]] <- NEI_wsi_match1_pool[["NEI_EGU_ID"]][kth_pool_row]
}

##########################
#Distance Sanity Check (DSC)

#Get a subset of CEMS_mflg0_leftover that are match flag 1 pre-DSC
CEMS_mflg1_preDSC <- CEMS_mflg0_leftover[mflg1_n_hits > 0, , drop = FALSE]
rownames(CEMS_mflg1_preDSC) <- NULL

#reorganize the table to have each NEI_EGU_ID in a private row:
#one row per (CEMS row, candidate) pair, stacked as all first candidates,
#then all second candidates, and so on
cems_row <- rep(seq_along(mflg1_hits), mflg1_n_hits)
hit_rank <- sequence(mflg1_n_hits)
pool_row <- as.integer(unlist(mflg1_hits))
stack_order <- order(hit_rank, cems_row)
cems_row <- cems_row[stack_order]
pool_row <- pool_row[stack_order]

CEMS_mflg1_preDSC_flat <- CEMS_mflg0_leftover[cems_row, cems_cols, drop = FALSE]
rownames(CEMS_mflg1_preDSC_flat) <- NULL
CEMS_mflg1_preDSC_flat$NEI_EGU_ID <- NEI_wsi_match1_pool[["NEI_EGU_ID"]][pool_row]

#great-circle distance between each CEMS unit and its NEI candidate, distance unit: km
CEMS_mflg1_DSC_flat <- CEMS_mflg1_preDSC_flat
if (nrow(CEMS_mflg1_DSC_flat) > 0) {
  CEMS_mflg1_DSC_flat$distance <- distHaversine(
    cbind(NEI_wsi_match1_pool[["LON_NEI"]][pool_row], NEI_wsi_match1_pool[["LAT_NEI"]][pool_row]),
    cbind(CEMS_mflg1_DSC_flat[["LON_CEMS"]], CEMS_mflg1_DSC_flat[["LAT_CEMS"]])
  ) / 1000
} else {
  CEMS_mflg1_DSC_flat$distance <- numeric(0)
}

#if matched points are too far away (distance > distance_buffer_bygrid), remove them from CEMS_mflg1_DSC_flat
#(pairs whose distance is NA are dropped as well)
CEMS_mflg1_flat <- CEMS_mflg1_DSC_flat %>% filter(distance <= distance_buffer_bygrid)

# Get a subset of CEMS_mflg0_leftover that are not matched after flag 1
CEMS_mflg1_leftover <- CEMS_mflg0_leftover[
  !(CEMS_mflg0_leftover[["ORIS_ID"]] %in% CEMS_mflg1_flat[["ORIS_ID"]]), cems_cols, drop = FALSE]
rownames(CEMS_mflg1_leftover) <- NULL

write.csv(CEMS_mflg1_flat, file = "CEMS_mflg1_flat.csv")
write.csv(CEMS_mflg1_leftover, file = "CEMS_mflg1_leftover.csv")

#emissions still unassigned after flag 1, absolute and as a share of the no-biomass CEMS totals
unassigned_CEMS_NOx_mflg1 <- sum(CEMS_mflg1_leftover$Annual_NOx_Emis_MetricTon)
unassigned_CEMS_SO2_mflg1 <- sum(CEMS_mflg1_leftover$Annual_SO2_Emis_MetricTon)
unassigned_CEMS_CO2_mflg1 <- sum(CEMS_mflg1_leftover$Annual_CO2_Emis_MetricTon)

unassigned_NOx_percentage_mflg1 <- unassigned_CEMS_NOx_mflg1 / CEMS_NOx_noBIO_tot * 100 #7.44%
unassigned_SO2_percentage_mflg1 <- unassigned_CEMS_SO2_mflg1 / CEMS_SO2_noBIO_tot * 100 #1.72%
unassigned_CO2_percentage_mflg1 <- unassigned_CEMS_CO2_mflg1 / CEMS_CO2_noBIO_tot * 100 #9.03%

#############################################################################################################################
#match_qual_flag = 2: matched by  ORISPL,               Fuel and passed distance sanity check

##########################
#prepare NEI_wsi_match2_pool array
NEI_wsi_match2_pool <- NEI_wsi_match1_pool

#CEMS columns carried into the per-flag output tables
cems_cols <- c("ORIS_ID", "ORISPL", "ORISUN", "LON_CEMS", "LAT_CEMS", "Fuel",
               "Annual_Operating_Time_hr", "Annual_Heat_Input_MMBtu",
               "Annual_NOx_Emis_MetricTon", "Annual_SO2_Emis_MetricTon",
               "Annual_CO2_Emis_MetricTon")

#matching each row of CEMS_mflg1_leftover with NEI_wsi_match2_pool by  ORISPL, Fuel
#for every leftover row collect the pool row numbers whose keys agree (in pool order);
#which() ignores NA comparisons, so a missing key means "no match" instead of an error
mflg2_hits <- lapply(seq_len(nrow(CEMS_mflg1_leftover)), function(rr_cems) {
  which(NEI_wsi_match2_pool[["ORISPL"]] == CEMS_mflg1_leftover[["ORISPL"]][rr_cems] &
          NEI_wsi_match2_pool[["Fuel"]] == CEMS_mflg1_leftover[["Fuel"]][rr_cems])
})
mflg2_n_hits <- lengths(mflg2_hits)
#largest number of candidates found for a single CEMS row (0 when nothing matched)
max_mflg2_NEI_EGU_ID <- max(0, mflg2_n_hits)

#write down all the NEI NEI_EGU_ID that meet the matching criteria
#(columns NEI_EGU_ID_1, NEI_EGU_ID_2, ... on CEMS_mflg1_leftover; NA where a row has fewer candidates)
for (k in seq_len(max_mflg2_NEI_EGU_ID)) {
  kth_pool_row <- vapply(mflg2_hits, function(h) h[k], integer(1))
  CEMS_mflg1_leftover[[sprintf("NEI_EGU_ID_%d", k)]] <- NEI_wsi_match2_pool[["NEI_EGU_ID"]][kth_pool_row]
}

##########################
#Distance Sanity Check (DSC)

#Get a subset of CEMS_mflg1_leftover that are match flag 2 pre-DSC
CEMS_mflg2_preDSC <- CEMS_mflg1_leftover[mflg2_n_hits > 0, , drop = FALSE]
rownames(CEMS_mflg2_preDSC) <- NULL

#reorganize the table to have each NEI_EGU_ID in a private row:
#one row per (CEMS row, candidate) pair, stacked as all first candidates,
#then all second candidates, and so on
cems_row <- rep(seq_along(mflg2_hits), mflg2_n_hits)
hit_rank <- sequence(mflg2_n_hits)
pool_row <- as.integer(unlist(mflg2_hits))
stack_order <- order(hit_rank, cems_row)
cems_row <- cems_row[stack_order]
pool_row <- pool_row[stack_order]

CEMS_mflg2_preDSC_flat <- CEMS_mflg1_leftover[cems_row, cems_cols, drop = FALSE]
rownames(CEMS_mflg2_preDSC_flat) <- NULL
CEMS_mflg2_preDSC_flat$NEI_EGU_ID <- NEI_wsi_match2_pool[["NEI_EGU_ID"]][pool_row]

#great-circle distance between each CEMS unit and its NEI candidate, distance unit: km
CEMS_mflg2_DSC_flat <- CEMS_mflg2_preDSC_flat
if (nrow(CEMS_mflg2_DSC_flat) > 0) {
  CEMS_mflg2_DSC_flat$distance <- distHaversine(
    cbind(NEI_wsi_match2_pool[["LON_NEI"]][pool_row], NEI_wsi_match2_pool[["LAT_NEI"]][pool_row]),
    cbind(CEMS_mflg2_DSC_flat[["LON_CEMS"]], CEMS_mflg2_DSC_flat[["LAT_CEMS"]])
  ) / 1000
} else {
  CEMS_mflg2_DSC_flat$distance <- numeric(0)
}

#if matched points are too far away (distance > distance_buffer_bygrid), remove them from CEMS_mflg2_DSC_flat
#(pairs whose distance is NA are dropped as well)
CEMS_mflg2_flat <- CEMS_mflg2_DSC_flat %>% filter(distance <= distance_buffer_bygrid)

# Get a subset of CEMS_mflg1_leftover that are not matched after flag 2
CEMS_mflg2_leftover <- CEMS_mflg1_leftover[
  !(CEMS_mflg1_leftover[["ORIS_ID"]] %in% CEMS_mflg2_flat[["ORIS_ID"]]), cems_cols, drop = FALSE]
rownames(CEMS_mflg2_leftover) <- NULL

write.csv(CEMS_mflg2_flat, file = "CEMS_mflg2_flat.csv")
write.csv(CEMS_mflg2_leftover, file = "CEMS_mflg2_leftover.csv")

#emissions still unassigned after flag 2, absolute and as a share of the no-biomass CEMS totals
unassigned_CEMS_NOx_mflg2 <- sum(CEMS_mflg2_leftover$Annual_NOx_Emis_MetricTon)
unassigned_CEMS_SO2_mflg2 <- sum(CEMS_mflg2_leftover$Annual_SO2_Emis_MetricTon)
unassigned_CEMS_CO2_mflg2 <- sum(CEMS_mflg2_leftover$Annual_CO2_Emis_MetricTon)

unassigned_NOx_percentage_mflg2 <- unassigned_CEMS_NOx_mflg2 / CEMS_NOx_noBIO_tot * 100 #6.80%
unassigned_SO2_percentage_mflg2 <- unassigned_CEMS_SO2_mflg2 / CEMS_SO2_noBIO_tot * 100 #1.45%
unassigned_CO2_percentage_mflg2 <- unassigned_CEMS_CO2_mflg2 / CEMS_CO2_noBIO_tot * 100 #8.30%

#############################################################################################################################
#match_qual_flag = 3: matched by  ORISPL,                    and passed distance sanity check

##########################
#prepare NEI_wsi_match3_pool array
NEI_wsi_match3_pool <- NEI_wsi_match2_pool

#CEMS columns carried into the per-flag output tables
cems_cols <- c("ORIS_ID", "ORISPL", "ORISUN", "LON_CEMS", "LAT_CEMS", "Fuel",
               "Annual_Operating_Time_hr", "Annual_Heat_Input_MMBtu",
               "Annual_NOx_Emis_MetricTon", "Annual_SO2_Emis_MetricTon",
               "Annual_CO2_Emis_MetricTon")

#matching each row of CEMS_mflg2_leftover with NEI_wsi_match3_pool by  ORISPL
#for every leftover row collect the pool row numbers with the same ORISPL (in pool order);
#which() ignores NA comparisons, so a missing key means "no match" instead of an error
mflg3_hits <- lapply(seq_len(nrow(CEMS_mflg2_leftover)), function(rr_cems) {
  which(NEI_wsi_match3_pool[["ORISPL"]] == CEMS_mflg2_leftover[["ORISPL"]][rr_cems])
})
mflg3_n_hits <- lengths(mflg3_hits)
#largest number of candidates found for a single CEMS row (0 when nothing matched)
max_mflg3_NEI_EGU_ID <- max(0, mflg3_n_hits)

#write down all the NEI NEI_EGU_ID that meet the matching criteria
#(columns NEI_EGU_ID_1, NEI_EGU_ID_2, ... on CEMS_mflg2_leftover; NA where a row has fewer candidates)
for (k in seq_len(max_mflg3_NEI_EGU_ID)) {
  kth_pool_row <- vapply(mflg3_hits, function(h) h[k], integer(1))
  CEMS_mflg2_leftover[[sprintf("NEI_EGU_ID_%d", k)]] <- NEI_wsi_match3_pool[["NEI_EGU_ID"]][kth_pool_row]
}

##########################
#Distance Sanity Check (DSC)

#Get a subset of CEMS_mflg2_leftover that are match flag 3 pre-DSC
CEMS_mflg3_preDSC <- CEMS_mflg2_leftover[mflg3_n_hits > 0, , drop = FALSE]
rownames(CEMS_mflg3_preDSC) <- NULL

#reorganize the table to have each NEI_EGU_ID in a private row:
#one row per (CEMS row, candidate) pair, stacked as all first candidates,
#then all second candidates, and so on
cems_row <- rep(seq_along(mflg3_hits), mflg3_n_hits)
hit_rank <- sequence(mflg3_n_hits)
pool_row <- as.integer(unlist(mflg3_hits))
stack_order <- order(hit_rank, cems_row)
cems_row <- cems_row[stack_order]
pool_row <- pool_row[stack_order]

CEMS_mflg3_preDSC_flat <- CEMS_mflg2_leftover[cems_row, cems_cols, drop = FALSE]
rownames(CEMS_mflg3_preDSC_flat) <- NULL
CEMS_mflg3_preDSC_flat$NEI_EGU_ID <- NEI_wsi_match3_pool[["NEI_EGU_ID"]][pool_row]

#great-circle distance between each CEMS unit and its NEI candidate, distance unit: km
CEMS_mflg3_DSC_flat <- CEMS_mflg3_preDSC_flat
if (nrow(CEMS_mflg3_DSC_flat) > 0) {
  CEMS_mflg3_DSC_flat$distance <- distHaversine(
    cbind(NEI_wsi_match3_pool[["LON_NEI"]][pool_row], NEI_wsi_match3_pool[["LAT_NEI"]][pool_row]),
    cbind(CEMS_mflg3_DSC_flat[["LON_CEMS"]], CEMS_mflg3_DSC_flat[["LAT_CEMS"]])
  ) / 1000
} else {
  CEMS_mflg3_DSC_flat$distance <- numeric(0)
}

#if matched points are too far away (distance > distance_buffer_bygrid), remove them from CEMS_mflg3_DSC_flat
#(pairs whose distance is NA are dropped as well)
CEMS_mflg3_flat <- CEMS_mflg3_DSC_flat %>% filter(distance <= distance_buffer_bygrid)

# Get a subset of CEMS_mflg2_leftover that are not matched after flag 3
CEMS_mflg3_leftover <- CEMS_mflg2_leftover[
  !(CEMS_mflg2_leftover[["ORIS_ID"]] %in% CEMS_mflg3_flat[["ORIS_ID"]]), cems_cols, drop = FALSE]
rownames(CEMS_mflg3_leftover) <- NULL

write.csv(CEMS_mflg3_flat, file = "CEMS_mflg3_flat.csv")
write.csv(CEMS_mflg3_leftover, file = "CEMS_mflg3_leftover.csv")

#emissions still unassigned after flag 3, absolute and as a share of the no-biomass CEMS totals
unassigned_CEMS_NOx_mflg3 <- sum(CEMS_mflg3_leftover$Annual_NOx_Emis_MetricTon)
unassigned_CEMS_SO2_mflg3 <- sum(CEMS_mflg3_leftover$Annual_SO2_Emis_MetricTon)
unassigned_CEMS_CO2_mflg3 <- sum(CEMS_mflg3_leftover$Annual_CO2_Emis_MetricTon)

unassigned_NOx_percentage_mflg3 <- unassigned_CEMS_NOx_mflg3 / CEMS_NOx_noBIO_tot * 100 #5.95%
unassigned_SO2_percentage_mflg3 <- unassigned_CEMS_SO2_mflg3 / CEMS_SO2_noBIO_tot * 100 #1.20%
unassigned_CO2_percentage_mflg3 <- unassigned_CEMS_CO2_mflg3 / CEMS_CO2_noBIO_tot * 100 #7.80%

#############################################################################################################################
#match_qual_flag = 4: matched by                        Fuel and        distance (mutually closest)

##########################
#prepare NEI_wsi_match4_pool array
#flag 4 no longer needs ORIS keys, only coordinates and fuel
NEI_wsi_match4_pool <- NEI_wsi %>% filter(!is.na(LON_NEI) & !is.na(LAT_NEI) & !is.na(Fuel))

#CEMS columns carried into the per-flag output tables
cems_cols <- c("ORIS_ID", "ORISPL", "ORISUN", "LON_CEMS", "LAT_CEMS", "Fuel",
               "Annual_Operating_Time_hr", "Annual_Heat_Input_MMBtu",
               "Annual_NOx_Emis_MetricTon", "Annual_SO2_Emis_MetricTon",
               "Annual_CO2_Emis_MetricTon")

#smallest non-missing value; Inf when every value is missing (so it can never equal a distance)
min_ignoring_na <- function(km) {
  km <- km[!is.na(km)]
  if (length(km) > 0) min(km) else Inf
}

n_cems_mflg4 <- nrow(CEMS_mflg3_leftover)
n_nei_mflg4 <- nrow(NEI_wsi_match4_pool)

if (n_cems_mflg4 > 0 && n_nei_mflg4 > 0) {
  #build distance matrix: rows = leftover CEMS units, columns = pool NEI stacks, distance unit: km
  #one distm() call computes every pair; the transpose puts CEMS on the rows
  dis_matrix <- t(distm(
    cbind(NEI_wsi_match4_pool[["LON_NEI"]], NEI_wsi_match4_pool[["LAT_NEI"]]),
    cbind(CEMS_mflg3_leftover[["LON_CEMS"]], CEMS_mflg3_leftover[["LAT_CEMS"]]),
    fun = distHaversine
  )) / 1000

  #distance from each CEMS unit to its nearest NEI stack, and from each NEI stack to its nearest CEMS unit
  nearest_nei_km <- apply(dis_matrix, 1, min_ignoring_na)
  nearest_cems_km <- apply(dis_matrix, 2, min_ignoring_na)

  ##########################
  #perform lat lon fuel matching
  #a pair matches when each is the other's nearest neighbour (ties all count),
  #the distance is below distance_buffer and the fuels agree;
  #which() ignores NA comparisons, so missing coordinates or fuel mean "no match"
  mflg4_hits <- lapply(seq_len(n_cems_mflg4), function(rr_cems) {
    row_km <- dis_matrix[rr_cems, ]
    which(row_km == nearest_nei_km[rr_cems] &
            row_km == nearest_cems_km &
            row_km < distance_buffer &
            NEI_wsi_match4_pool[["Fuel"]] == CEMS_mflg3_leftover[["Fuel"]][rr_cems])
  })
} else {
  #nothing to match on either side
  dis_matrix <- array(c(0), dim = c(n_cems_mflg4, n_nei_mflg4))
  mflg4_hits <- rep(list(integer(0)), n_cems_mflg4)
}
mflg4_n_hits <- lengths(mflg4_hits)
#largest number of matches found for a single CEMS row (0 when nothing matched)
max_mflg4_NEI_EGU_ID <- max(0, mflg4_n_hits)

#write down all the NEI NEI_EGU_ID that meet the matching criteria
#(columns NEI_EGU_ID_1, NEI_EGU_ID_2, ... on CEMS_mflg3_leftover; NA where a row has fewer matches)
for (k in seq_len(max_mflg4_NEI_EGU_ID)) {
  kth_pool_row <- vapply(mflg4_hits, function(h) h[k], integer(1))
  CEMS_mflg3_leftover[[sprintf("NEI_EGU_ID_%d", k)]] <- NEI_wsi_match4_pool[["NEI_EGU_ID"]][kth_pool_row]
}

##########################
#Get a subset of CEMS_mflg3_leftover that are match flag 4
CEMS_mflg4 <- CEMS_mflg3_leftover[mflg4_n_hits > 0, , drop = FALSE]
rownames(CEMS_mflg4) <- NULL

#reorganize the table to have each NEI_EGU_ID in a private row:
#one row per (CEMS row, match) pair, stacked as all first matches, then all second matches, and so on
cems_row <- rep(seq_along(mflg4_hits), mflg4_n_hits)
hit_rank <- sequence(mflg4_n_hits)
pool_row <- as.integer(unlist(mflg4_hits))
stack_order <- order(hit_rank, cems_row)

CEMS_mflg4_flat <- CEMS_mflg3_leftover[cems_row[stack_order], cems_cols, drop = FALSE]
rownames(CEMS_mflg4_flat) <- NULL
CEMS_mflg4_flat$NEI_EGU_ID <- NEI_wsi_match4_pool[["NEI_EGU_ID"]][pool_row[stack_order]]

# Get a subset of CEMS_mflg3_leftover that are not matched after flag 4
CEMS_mflg4_leftover <- CEMS_mflg3_leftover[
  !(CEMS_mflg3_leftover[["ORIS_ID"]] %in% CEMS_mflg4_flat[["ORIS_ID"]]), cems_cols, drop = FALSE]
rownames(CEMS_mflg4_leftover) <- NULL

write.csv(CEMS_mflg4_flat, file = "CEMS_mflg4_flat.csv")
write.csv(CEMS_mflg4_leftover, file = "CEMS_mflg4_leftover.csv")

#emissions still unassigned after flag 4, absolute and as a share of the no-biomass CEMS totals
unassigned_CEMS_NOx_mflg4 <- sum(CEMS_mflg4_leftover$Annual_NOx_Emis_MetricTon)
unassigned_CEMS_SO2_mflg4 <- sum(CEMS_mflg4_leftover$Annual_SO2_Emis_MetricTon)
unassigned_CEMS_CO2_mflg4 <- sum(CEMS_mflg4_leftover$Annual_CO2_Emis_MetricTon)

unassigned_NOx_percentage_mflg4 <- unassigned_CEMS_NOx_mflg4 / CEMS_NOx_noBIO_tot * 100 #3.36%
unassigned_SO2_percentage_mflg4 <- unassigned_CEMS_SO2_mflg4 / CEMS_SO2_noBIO_tot * 100 #0.53%
unassigned_CO2_percentage_mflg4 <- unassigned_CEMS_CO2_mflg4 / CEMS_CO2_noBIO_tot * 100 #3.66%

#############################################################################################################################
#match_qual_flag = 5: matched by                                        distance (mutually closest)

##########################
#prepare NEI_wsi_match5_pool array
NEI_wsi_match5_pool = NEI_wsi_match4_pool

#build distance matrix
dis_matrix <- array(c(0), dim = c(nrow(CEMS_mflg4_leftover), nrow(NEI_wsi_match5_pool)))
for (rr_cems in 1:nrow(dis_matrix)) {
  for (cc_nei in 1:ncol(dis_matrix)) {
    dis_matrix[rr_cems,cc_nei] = distm(c(NEI_wsi_match5_pool[cc_nei, "LON_NEI"], NEI_wsi_match5_pool[cc_nei, "LAT_NEI"]), c(CEMS_mflg4_leftover[rr_cems, "LON_CEMS"], CEMS_mflg4_leftover[rr_cems, "LAT_CEMS"]), fun = distHaversine)/1000 #distance unit: km
  }
}

##########################
#perform lat lon matching
max_mflg5_NEI_EGU_ID = 0
for (rr_cems in 1:nrow(dis_matrix)) {
  #find the nearest NEI to current CEMS
  closest_NEI = which(dis_matrix[rr_cems,] == min(dis_matrix[rr_cems,]))
  num_closest_NEI = length(closest_NEI)
  match_NEI_EGU_ID = 0
  for (ii in 1:num_closest_NEI) {
    cc_nei = closest_NEI[ii]
    closest_CEMS = which(dis_matrix[,cc_nei] == min(dis_matrix[,cc_nei]))
    if (rr_cems %in% closest_CEMS & dis_matrix[rr_cems,cc_nei] < distance_buffer){
      match_NEI_EGU_ID = match_NEI_EGU_ID + 1
      #write down all the NEI NEI_EGU_ID that meet the matching criteria
      CEMS_mflg4_leftover[rr_cems, sprintf("NEI_EGU_ID_%d", match_NEI_EGU_ID)] = NEI_wsi_match5_pool[cc_nei, "NEI_EGU_ID"]
    }
  }
  if (match_NEI_EGU_ID > max_mflg5_NEI_EGU_ID){
    max_mflg5_NEI_EGU_ID = match_NEI_EGU_ID
  }
}

##########################
#Get a subset of CEMS_mflg4_leftover that are match flag 5, then reorganize it so that
#each matched NEI_EGU_ID sits in its own row (one copy of the CEMS row per matched NEI).
mflg5_cems_cols <- c("ORIS_ID", "ORISPL", "ORISUN", "LON_CEMS", "LAT_CEMS", "Fuel",
                     "Annual_Operating_Time_hr", "Annual_Heat_Input_MMBtu",
                     "Annual_NOx_Emis_MetricTon", "Annual_SO2_Emis_MetricTon",
                     "Annual_CO2_Emis_MetricTon")

if (max_mflg5_NEI_EGU_ID > 0) {
  #rows that received at least one NEI partner
  CEMS_mflg5 <- CEMS_mflg4_leftover %>% filter(!is.na(NEI_EGU_ID_1))

  #one copy of CEMS_mflg5 per NEI_EGU_ID_* column, each with that column renamed to NEI_EGU_ID
  mflg5_copies <- lapply(seq_len(max_mflg5_NEI_EGU_ID), function(slot) {
    slot_col <- paste0("NEI_EGU_ID_", slot)
    slot_copy <- CEMS_mflg5[, c(mflg5_cems_cols, slot_col), drop = FALSE]
    names(slot_copy)[names(slot_copy) == slot_col] <- "NEI_EGU_ID"
    slot_copy
  })
  #stack the copies in slot order (all of slot 1, then all of slot 2, ...)
  CEMS_mflg5_flat_scratch <- do.call(rbind, mflg5_copies)
} else {
  #no flag-5 match at all: no NEI_EGU_ID_* column exists, so build empty tables with the
  #expected columns instead of failing on a missing column
  CEMS_mflg5 <- CEMS_mflg4_leftover[0, , drop = FALSE]
  CEMS_mflg5_flat_scratch <- CEMS_mflg4_leftover[0, mflg5_cems_cols, drop = FALSE]
  CEMS_mflg5_flat_scratch$NEI_EGU_ID <- NEI_wsi_match5_pool$NEI_EGU_ID[0]
}

#remove rows with NEI_EGU_ID == NA (CEMS rows that have fewer partners than the maximum)
CEMS_mflg5_flat <- CEMS_mflg5_flat_scratch %>% filter(!is.na(NEI_EGU_ID))

# CEMS units that still have no NEI partner after flag 5: drop every ORIS_ID that appears
# in the flag-5 matches and keep only the original CEMS columns
mflg5_leftover_cols <- c("ORIS_ID", "ORISPL", "ORISUN", "LON_CEMS", "LAT_CEMS", "Fuel",
                         "Annual_Operating_Time_hr", "Annual_Heat_Input_MMBtu",
                         "Annual_NOx_Emis_MetricTon", "Annual_SO2_Emis_MetricTon",
                         "Annual_CO2_Emis_MetricTon")
CEMS_mflg5_leftover <- CEMS_mflg4_leftover %>%
  filter(!(ORIS_ID %in% CEMS_mflg5_flat$ORIS_ID))
CEMS_mflg5_leftover <- CEMS_mflg5_leftover[, mflg5_leftover_cols, drop = FALSE]

# save the flag-5 matches and the remaining unmatched units
write.csv(CEMS_mflg5_flat, file = "CEMS_mflg5_flat.csv")
write.csv(CEMS_mflg5_leftover, file = "CEMS_mflg5_leftover.csv")

# annual emissions of the units that are still unassigned after flag 5
unassigned_CEMS_NOx_mflg5 <- sum(CEMS_mflg5_leftover[["Annual_NOx_Emis_MetricTon"]])
unassigned_CEMS_SO2_mflg5 <- sum(CEMS_mflg5_leftover[["Annual_SO2_Emis_MetricTon"]])
unassigned_CEMS_CO2_mflg5 <- sum(CEMS_mflg5_leftover[["Annual_CO2_Emis_MetricTon"]])

# the same quantities as a percentage of the CEMS_*_noBIO_tot totals computed earlier in the script
unassigned_NOx_percentage_mflg5 <- unassigned_CEMS_NOx_mflg5 / CEMS_NOx_noBIO_tot * 100 #2.01%
unassigned_SO2_percentage_mflg5 <- unassigned_CEMS_SO2_mflg5 / CEMS_SO2_noBIO_tot * 100 #0.03%
unassigned_CO2_percentage_mflg5 <- unassigned_CEMS_CO2_mflg5 / CEMS_CO2_noBIO_tot * 100 #1.59%

##################################################################################################################################
# summarize and combine CEMS that are matched with NEI
#add a distance column filled with NA to the flag 4 and flag 5 tables so that they have the same
#columns as the other tables being stacked; the distance is not needed afterwards and the column
#is dropped right after the matched results of the six flags (0-5) are combined.
#rep(NA, nrow(...)) is used instead of a bare NA so the assignment also works on a zero-row table.
CEMS_mflg4_flat$distance <- rep(NA, nrow(CEMS_mflg4_flat))
CEMS_mflg5_flat$distance <- rep(NA, nrow(CEMS_mflg5_flat))

CEMS_NEI_match_flat <- rbind(CEMS_mflg0_flat, CEMS_mflg1_flat, CEMS_mflg2_flat, CEMS_mflg3_flat, CEMS_mflg4_flat, CEMS_mflg5_flat)
#keep the CEMS columns plus the matched NEI_EGU_ID only
CEMS_NEI_match_flat <- subset(CEMS_NEI_match_flat, select = c("ORIS_ID", "ORISPL", "ORISUN", "LON_CEMS", "LAT_CEMS", "Fuel", "Annual_Operating_Time_hr", "Annual_Heat_Input_MMBtu", "Annual_NOx_Emis_MetricTon", "Annual_SO2_Emis_MetricTon", "Annual_CO2_Emis_MetricTon", "NEI_EGU_ID"))
write.csv(CEMS_NEI_match_flat, file = "CEMS_NEI_match_flat_202108.csv")

#Get a subset of CEMS that are not matched with NEIs by any method
#(whatever is still left over after the last matching step, flag 5)
CEMS_NEI_notmatch <- CEMS_mflg5_leftover
write.csv(CEMS_NEI_notmatch, file = "CEMS_NEI_notmatch.csv")

##################################################################################################################################
#Attach the NEI stack parameters (and other NEI columns) to every matched CEMS row via NEI_EGU_ID.
#The resulting table of CEMS fuel / capacity versus NEI stack parameters is the basis of the
#artificial stack parameter database for the CEMS points that found no NEI partner by any method.

#NEI_wsi carries its own Fuel / ORISPL / ORISUN columns; give them an _NEI suffix so they
#do not collide with the CEMS columns of the same name in the join
for (shared_col in c("Fuel", "ORISPL", "ORISUN")) {
  names(NEI_wsi)[names(NEI_wsi) == shared_col] <- paste0(shared_col, "_NEI")
}

#left join keeps every matched CEMS row; rows with zero operating time are then dropped
#because the hourly heat input below divides by the operating hours
CEMS_NEI_stack_info <- join(CEMS_NEI_match_flat, NEI_wsi, by = "NEI_EGU_ID", type = "left") %>%
  filter(Annual_Operating_Time_hr != 0)
CEMS_NEI_stack_info$Hrly_Heat_Input <-
  CEMS_NEI_stack_info$Annual_Heat_Input_MMBtu / CEMS_NEI_stack_info$Annual_Operating_Time_hr

#simplify the output
stack_info_cols <- c("ORIS_ID", "NEI_EGU_ID", "Fuel", "Hrly_Heat_Input",
                     "STKHGT", "STKDIAM", "STKTEMP", "STKVEL", "STKFLOW", "CO_NEI")
CEMS_NEI_stack_info_subset <- CEMS_NEI_stack_info[, stack_info_cols, drop = FALSE]
#CO_NEI in short tons

#in some cases, one CEMS can match with more than one (a set of) NEIs
#in this case, distribute Hrly_Heat_Input to all these matching NEIs (same for emissions)
#according to their relative CO emissions contribution to the set's total
#If it is a one to one match, CO_NEI = NA is not an issue at all, can use state-level co-emitted species/CO2 ratio to get CO and other species emissions
#If it is a one CEMS to multiple NEI match, exclude the NEIs that have CO_NEI = NA from the distribution and analysis from now on.
#Note: when the CO_NEI values of a set sum to zero the fractions are NaN, and so is HrlyHeatInput_toNEI for that set.

##########################################################
#loop over ORIS_ID
CEMS_NEI_matched_ORIS_ID <- unique(CEMS_NEI_stack_info_subset$ORIS_ID)
if (length(CEMS_NEI_matched_ORIS_ID) == 0) {
  stop("CEMS_NEI_stack_info_subset has no rows: cannot build the capacity / stack reference table", call. = FALSE)
}

#collect the per-ORIS_ID tables in a list and bind them once at the end
#(growing a data frame with rbind inside the loop copies it on every iteration)
ORIS_ID_sets <- vector("list", length(CEMS_NEI_matched_ORIS_ID))
set_index <- 0
for (uniq_ORIS_ID in CEMS_NEI_matched_ORIS_ID) {
  set_index <- set_index + 1
  #all NEI points matched to the current ORIS_ID
  ORIS_ID_set <- CEMS_NEI_stack_info_subset %>% filter(ORIS_ID == uniq_ORIS_ID)
  if (nrow(ORIS_ID_set) > 1) {
    #several NEIs: drop those without CO_NEI and weight the rest by their share of the set's CO total
    ORIS_ID_set <- ORIS_ID_set %>% filter(!is.na(CO_NEI))
    ORIS_ID_set$CO_NEI_frac <- ORIS_ID_set$CO_NEI / sum(ORIS_ID_set$CO_NEI)
  } else {
    #single NEI: it receives the whole heat input, CO_NEI = NA does not matter here
    ORIS_ID_set$CO_NEI_frac <- rep(1.0, nrow(ORIS_ID_set))
  }
  ORIS_ID_set$HrlyHeatInput_toNEI <- ORIS_ID_set$Hrly_Heat_Input * ORIS_ID_set$CO_NEI_frac
  ORIS_ID_sets[[set_index]] <- ORIS_ID_set
}
#stack ORIS_ID_set tables in the order the ORIS_IDs were visited
CEMS_NEI_stack_info_refine <- do.call(rbind, ORIS_ID_sets)

#only keep stack parameters and hourly capacity indicator
CEMS_NEI_stack_info_refine <- subset(CEMS_NEI_stack_info_refine, select = c("ORIS_ID", "NEI_EGU_ID", "Fuel", "HrlyHeatInput_toNEI", "STKHGT", "STKDIAM", "STKTEMP", "STKVEL", "STKFLOW"))

#still need to sum hourly heat input by unique NEI_EGU_ID to get the actual capacity info of each unique NEI point
#because it is also possible that one NEI associates with multiple CEMS
#The result has one row per NEI_EGU_ID: the summed heat input plus the Fuel and stack
#parameters taken from the first row found for that NEI_EGU_ID.
#loop over unique NEI_EGU_ID
NEI_EGU_ID_unique <- unique(CEMS_NEI_stack_info_refine$NEI_EGU_ID)
if (length(NEI_EGU_ID_unique) == 0) {
  stop("CEMS_NEI_stack_info_refine has no rows: cannot build the capacity / stack reference table", call. = FALSE)
}

#collect one summary row per NEI point in a list and bind once at the end
#(growing a data frame with rbind inside the loop copies it on every iteration)
NEI_EGU_ID_rows <- vector("list", length(NEI_EGU_ID_unique))
nei_index <- 0
for (uniq_NEI_EGU_ID in NEI_EGU_ID_unique) {
  nei_index <- nei_index + 1
  #all CEMS contributions assigned to the current NEI point
  NEI_EGU_ID_set <- CEMS_NEI_stack_info_refine %>% filter(NEI_EGU_ID == uniq_NEI_EGU_ID)
  #total hourly heat input of this NEI point; an NA/NaN contribution makes the total NA/NaN
  NEI_EGU_ID_set$HrlyHeatInput <- rep(sum(NEI_EGU_ID_set$HrlyHeatInput_toNEI), nrow(NEI_EGU_ID_set))
  NEI_EGU_ID_set <- subset(NEI_EGU_ID_set, select = c("NEI_EGU_ID", "Fuel", "HrlyHeatInput", "STKHGT", "STKDIAM", "STKTEMP", "STKVEL", "STKFLOW"))
  #keep a single row per NEI point
  NEI_EGU_ID_rows[[nei_index]] <- NEI_EGU_ID_set %>% distinct(NEI_EGU_ID, .keep_all = TRUE)
}
#stack the per-NEI rows in the order the NEI_EGU_IDs were visited
NEI_fuel_capa_stackinfo <- do.call(rbind, NEI_EGU_ID_rows)

#hourly CO2 emissions and hourly heat input are highly correlated according to V1 script, do not repeat here again
#hourly CO2 has more zeros than hourly heat input, to get more useful capacity-stack info data
#Use hourly heat input as the indicator of capacity, with NA / NaN totals removed

NEI_fuel_capa_stackinfo <- NEI_fuel_capa_stackinfo %>% filter(!is.na(HrlyHeatInput))

##################################################################################################
#Plan for a CEMS EGU that has no NEI match yet:
#1. use its fuel type to pick the fuel-specific table of hourly heat input and stack info
#2. append the CEMS EGU to that table and run a k-means analysis (https://www.datacamp.com/tutorial/k-means-clustering-r)
#   on hourly heat input to find the capacity cluster it falls in (hourly heat input can be zero)
#3. pick stack parameters (STKHGT, STKTEMP, STKFLOW and STKDIAM) with random noise, within a reasonable range
#   around the percentile at which the CEMS hourly heat input sits inside that capacity cluster
#4. calculate STKVEL = STKFLOW/(pi*(STKDIAM/2)^2)

##################################################################################################
#Prepare the fuel-specific capacity-stack info data sets

#fuel types present in the reference table (for inspection)
Fuel_type <- unique(NEI_fuel_capa_stackinfo["Fuel"])

#one reference table per fuel type
capa_stackinfo_NG <- filter(NEI_fuel_capa_stackinfo, Fuel == "EGU_NG")
capa_stackinfo_Coal <- filter(NEI_fuel_capa_stackinfo, Fuel == "EGU_Coal")
capa_stackinfo_Oil <- filter(NEI_fuel_capa_stackinfo, Fuel == "EGU_Oil")

#Within each fuel, keep the HrlyHeatInput = 0 rows apart from the non-zero rows.
#The zero tables supply the stack parameters for CEMS points whose hourly heat input is 0 or NA.
HI0_stackinfo_NG <- filter(capa_stackinfo_NG, HrlyHeatInput == 0)
HInon0_stackinfo_NG <- filter(capa_stackinfo_NG, HrlyHeatInput != 0)

HI0_stackinfo_Coal <- filter(capa_stackinfo_Coal, HrlyHeatInput == 0)
HInon0_stackinfo_Coal <- filter(capa_stackinfo_Coal, HrlyHeatInput != 0)

HI0_stackinfo_Oil <- filter(capa_stackinfo_Oil, HrlyHeatInput == 0)
HInon0_stackinfo_Oil <- filter(capa_stackinfo_Oil, HrlyHeatInput != 0)

##################################################################################################
#Look at how the non-zero hourly heat inputs are distributed for each fuel type in order to
#decide how many k-means clusters to use. For every fuel: bin the values, draw the histogram,
#show summary statistics, then record the chosen number of clusters.

#NG
##################################################################################################
dist_HI_NG <- hist(HInon0_stackinfo_NG$HrlyHeatInput, plot = FALSE)
plot(dist_HI_NG, main = "fuel type: NG", xlab = "hourly heat input", ylab = "Counts", col = "grey")
describe(HInon0_stackinfo_NG$HrlyHeatInput)
#number of clusters chosen for NG
num_cluster_NG <- 5

#Coal
##################################################################################################
dist_HI_Coal <- hist(HInon0_stackinfo_Coal$HrlyHeatInput, plot = FALSE)
plot(dist_HI_Coal, main = "fuel type: Coal", xlab = "hourly heat input", ylab = "Counts", col = "grey")
describe(HInon0_stackinfo_Coal$HrlyHeatInput)
#number of clusters chosen for Coal
num_cluster_Coal <- 5

#Oil
##################################################################################################
dist_HI_Oil <- hist(HInon0_stackinfo_Oil$HrlyHeatInput, plot = FALSE)
plot(dist_HI_Oil, main = "fuel type: Oil", xlab = "hourly heat input", ylab = "Counts", col = "grey")
describe(HInon0_stackinfo_Oil$HrlyHeatInput)
#number of clusters chosen for Oil
num_cluster_Oil <- 5

##################################################################################################
#get CEMS EGU hourly heat input
#in CEMS_NEI_notmatch there are rows with Annual_Operating_Time_hr = 0, and the Annual_Heat_Input_MMBtu for all these rows are = 0
#But I am not supposed to get rid of these rows, because I want to get artificial stack parameters for every single CEMS point so that we do not miss any CEMS CO2 emissions
#So, let 0/0 give NaN and then use 0 as the hourly heat input
CEMS_NEI_notmatch$HrlyHeatInput <- CEMS_NEI_notmatch$Annual_Heat_Input_MMBtu / CEMS_NEI_notmatch$Annual_Operating_Time_hr
#change every non-finite ratio to 0, not only NaN:
#  NaN = 0/0, NA = a missing input, Inf = heat input reported with zero operating hours.
#Any of these would otherwise stop the per-fuel assignment further down (NA in an if() test,
#Inf passed to kmeans), and such units are meant to take the zero-heat-input stack parameters.
CEMS_NEI_notmatch$HrlyHeatInput[!is.finite(CEMS_NEI_notmatch$HrlyHeatInput)] <- 0

#simplify CEMS_NEI_notmatch
CEMS_NEI_notmatch_subset <- subset(CEMS_NEI_notmatch, select = c("ORIS_ID", "Fuel", "HrlyHeatInput"))

##################################################################################################
#For a CEMS EGU that is not matched yet with the NEI:
#First, based on its fuel type, select which fuel-specific hourly heat input and stack info group it belongs to

#separate the CEMS_NEI_notmatch_subset by fuel type
#read unique fuel type in fuel column
Fuel_type <- unique(CEMS_NEI_notmatch_subset[c("Fuel")])
CEMS_NEI_notmatch_NG <- CEMS_NEI_notmatch_subset %>% filter(Fuel == "EGU_NG")
CEMS_NEI_notmatch_Coal <- CEMS_NEI_notmatch_subset %>% filter(Fuel == "EGU_Coal")
CEMS_NEI_notmatch_Oil <- CEMS_NEI_notmatch_subset %>% filter(Fuel == "EGU_Oil")

##################################################################################################
#Assign artificial stack parameters to the unmatched CEMS units of each fuel type.
#The procedure is the same for NG, Coal and Oil, so it is written once as a helper.

#Give every row of `unmatched` the columns STKHGT, STKTEMP, STKFLOW, STKDIAM and STKVEL.
#  unmatched   : unmatched CEMS units of one fuel type; must have a HrlyHeatInput column
#  ref_nonzero : matched reference units of the same fuel with HrlyHeatInput != 0
#  ref_zero    : matched reference units of the same fuel with HrlyHeatInput == 0
#  n_clusters  : number of k-means capacity clusters for this fuel
#Returns `unmatched` with the five stack columns added. A table with zero rows is returned
#with the (empty) columns added, so a fuel type without unmatched units is not an error.
#Rows with a negative heat input are left with NA stack parameters.
assign_artificial_stack_params <- function(unmatched, ref_nonzero, ref_zero, n_clusters) {
  stack_cols <- c("STKHGT", "STKTEMP", "STKFLOW", "STKDIAM")
  for (col in c(stack_cols, "STKVEL")) {
    unmatched[[col]] <- rep(NA_real_, nrow(unmatched))
  }
  ref_heat <- ref_nonzero$HrlyHeatInput
  n_ref <- length(ref_heat)

  for (rrr in seq_len(nrow(unmatched))) {
    HIrrr <- unmatched$HrlyHeatInput[rrr]

    if (is.na(HIrrr) || HIrrr == 0) {
      #heat input = 0 (or missing) is a special case, just use average values from the
      #0 heat input reference cases with the same fuel type
      for (col in stack_cols) {
        unmatched[rrr, col] <- mean(ref_zero[[col]])
      }
    } else if (HIrrr > 0) {
      #cluster the reference heat inputs together with this unit (appended as the last
      #element) to find which capacity cluster the unit belongs to.
      #The fixed seed makes the k-means start points, and the jitter drawn afterwards,
      #reproducible from run to run.
      set.seed(2)
      cluster_id <- kmeans(append(ref_heat, HIrrr), centers = n_clusters, nstart = 25)$cluster
      which_cluster <- cluster_id[n_ref + 1]
      cluster_refs <- ref_nonzero[cluster_id[seq_len(n_ref)] == which_cluster, , drop = FALSE]
      if (nrow(cluster_refs) == 0) {
        #the unit forms a cluster of its own, so there is no percentile to compute;
        #fall back to the reference unit with the closest heat input
        warning(sprintf("hourly heat input %g shares its cluster with no reference unit; using the nearest reference unit", HIrrr), call. = FALSE)
        cluster_refs <- ref_nonzero[which.min(abs(ref_heat - HIrrr)), , drop = FALSE]
      }
      #sort small to large (ascending) so that a row index corresponds to a percentile
      cluster_refs <- cluster_refs[order(cluster_refs$HrlyHeatInput), , drop = FALSE]

      #fraction of the cluster's reference heat inputs that are <= HIrrr
      pct <- ecdf(cluster_refs$HrlyHeatInput)(HIrrr)
      #add random noise of up to +/- 20% of pct, then limit the result to 0~1
      pct_w_noise <- jitter(pct, factor = 1, amount = 0.2 * pct)
      pct_w_noise <- max(min(pct_w_noise, 1.0), 0)
      print("pct_w_noise")
      print(pct_w_noise)

      #turn the noisy percentile into a row of the sorted cluster (at least row 1)
      select_row_index <- max(round(pct_w_noise * nrow(cluster_refs)), 1)
      for (col in stack_cols) {
        unmatched[rrr, col] <- cluster_refs[select_row_index, col]
      }
    }
  }

  #STKVEL = STKFLOW/(pi*(STKDIAM/2)^2), computed from the parameters just assigned
  unmatched$STKVEL <- unmatched$STKFLOW / (pi * (unmatched$STKDIAM / 2)^2)
  unmatched
}

##################################################################################################
#NG
CEMS_NEI_notmatch_NG <- assign_artificial_stack_params(CEMS_NEI_notmatch_NG, HInon0_stackinfo_NG, HI0_stackinfo_NG, num_cluster_NG)

##################################################################################################
#Coal
CEMS_NEI_notmatch_Coal <- assign_artificial_stack_params(CEMS_NEI_notmatch_Coal, HInon0_stackinfo_Coal, HI0_stackinfo_Coal, num_cluster_Coal)

##################################################################################################
#Oil
CEMS_NEI_notmatch_Oil <- assign_artificial_stack_params(CEMS_NEI_notmatch_Oil, HInon0_stackinfo_Oil, HI0_stackinfo_Oil, num_cluster_Oil)

#put the three fuel-specific tables (now carrying artificial stack parameters) back together
CEMS_NEI_notmatch_Astackinfo <- do.call(
  rbind,
  list(CEMS_NEI_notmatch_NG, CEMS_NEI_notmatch_Coal, CEMS_NEI_notmatch_Oil)
)

#Bring back the other CEMS columns (ORISPL, ORISUN, LAT_CEMS, LON_CEMS, reported NOx, SO2, CO2 emissions, ...)
#by joining the stack parameters onto the full unmatched table
astack_join_keys <- c("ORIS_ID", "Fuel", "HrlyHeatInput")
CEMS_NEI_notmatch_Astack <- join(CEMS_NEI_notmatch, CEMS_NEI_notmatch_Astackinfo, by = astack_join_keys, type = "left")

#keep the CEMS identification / emission columns followed by the stack parameters
astack_out_cols <- c("ORIS_ID", "ORISPL", "ORISUN", "LON_CEMS", "LAT_CEMS", "Fuel",
                     "Annual_Operating_Time_hr", "Annual_Heat_Input_MMBtu",
                     "Annual_NOx_Emis_MetricTon", "Annual_SO2_Emis_MetricTon",
                     "Annual_CO2_Emis_MetricTon",
                     "STKHGT", "STKTEMP", "STKFLOW", "STKDIAM", "STKVEL")
CEMS_NEI_notmatch_Astack <- CEMS_NEI_notmatch_Astack[, astack_out_cols, drop = FALSE]

write.csv(CEMS_NEI_notmatch_Astack, file = "CEMS_NEI_notmatch_Astack_202108.csv")